Browse Source

encoding.c: Support the UTF-8 encoding.

master
Reece H. Dunn 8 years ago
parent
commit
fa5d31a8af
3 changed files with 163 additions and 1 deletions
  1. 1
    0
      src/include/espeak-ng/espeak_ng.h
  2. 57
    1
      src/libespeak-ng/encoding.c
  3. 105
    0
      tests/encoding.c

+ 1
- 0
src/include/espeak-ng/espeak_ng.h View File

@@ -208,6 +208,7 @@ typedef enum
ESPEAKNG_ENCODING_ISO_8859_16,
ESPEAKNG_ENCODING_KOI8_R,
ESPEAKNG_ENCODING_ISCII,
ESPEAKNG_ENCODING_UTF_8,
} espeak_ng_ENCODING;

ESPEAK_NG_API espeak_ng_ENCODING

+ 57
- 1
src/libespeak-ng/encoding.c View File

@@ -25,6 +25,9 @@
#include "speech.h"
#include "encoding.h"

// Mask selecting the two most significant bits of a byte.
#define LEADING_2_BITS 0xC0 // 0b11000000
// Value of those two bits in a UTF-8 tail (continuation) byte: 10xxxxxx.
// A byte c is a tail byte when (c & LEADING_2_BITS) == UTF8_TAIL_BITS.
#define UTF8_TAIL_BITS 0x80 // 0b10000000

// http://www.iana.org/assignments/character-sets/character-sets.xhtml
MNEM_TAB mnem_encoding[] = {
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII },
@@ -82,6 +85,7 @@ MNEM_TAB mnem_encoding[] = {
{ "Latin-9", ESPEAKNG_ENCODING_ISO_8859_15 },
{ "TIS-620", ESPEAKNG_ENCODING_ISO_8859_11 },
{ "US-ASCII", ESPEAKNG_ENCODING_US_ASCII },
{ "UTF-8", ESPEAKNG_ENCODING_UTF_8 },
{ "cp367", ESPEAKNG_ENCODING_US_ASCII },
{ "cp819", ESPEAKNG_ENCODING_ISO_8859_1 },
{ "csASCII", ESPEAKNG_ENCODING_US_ASCII },
@@ -101,6 +105,7 @@ MNEM_TAB mnem_encoding[] = {
{ "csISOLatinHebrew", ESPEAKNG_ENCODING_ISO_8859_8 },
{ "csKOI8R", ESPEAKNG_ENCODING_KOI8_R },
{ "csTIS620", ESPEAKNG_ENCODING_ISO_8859_11 },
{ "csUTF8", ESPEAKNG_ENCODING_UTF_8 },
{ "arabic", ESPEAKNG_ENCODING_ISO_8859_6 },
{ "cyrillic", ESPEAKNG_ENCODING_ISO_8859_5 },
{ "greek", ESPEAKNG_ENCODING_ISO_8859_7 },
@@ -511,6 +516,56 @@ string_decoder_getc_codepage(espeak_ng_TEXT_DECODER *decoder)
return (c >= 0x80) ? decoder->codepage[c - 0x80] : c;
}

/*
 * Read one Unicode codepoint from the decoder's UTF-8 byte stream.
 *
 * The lead byte at decoder->current is consumed, followed by the tail
 * (continuation) bytes that its high bits call for.
 *
 * Returns the decoded codepoint, or U+FFFD (REPLACEMENT CHARACTER) when:
 *   - the byte read is a tail byte with no lead byte in front of it;
 *   - an expected tail byte is something else -- that byte is left unread
 *     so the next call decodes it on its own;
 *   - fewer tail bytes remain before decoder->end than the lead byte calls
 *     for -- the rest of the input is discarded (current is set to end);
 *   - the decoded value is greater than 0x10FFFF.
 *
 * The lead byte is read without a bounds check, so the caller is expected
 * to test for end of input before calling. Overlong forms and surrogate
 * codepoints are not rejected.
 */
static uint32_t
string_decoder_getc_utf_8(espeak_ng_TEXT_DECODER *decoder)
{
	uint8_t c = *decoder->current++ & 0xFF;
	uint32_t ret;
	int tail_bytes;

	switch (c & 0xF0)
	{
	// UTF-8 tail byte -- invalid in isolation
	case 0x80: case 0x90: case 0xA0: case 0xB0:
		return 0xFFFD;
	// 2-byte UTF-8 sequence
	case 0xC0: case 0xD0:
		tail_bytes = 1;
		ret = c & 0x1F;
		break;
	// 3-byte UTF-8 sequence
	case 0xE0:
		tail_bytes = 2;
		ret = c & 0x0F;
		break;
	// 4-byte UTF-8 sequence
	case 0xF0:
		tail_bytes = 3;
		// Bit 3 is deliberately kept: lead bytes 0xF8-0xFF then decode to a
		// value above 0x10FFFF and are rejected by the final range check.
		ret = c & 0x0F;
		break;
	// 1-byte UTF-8 sequence (0x00-0x7F)
	default:
		return c;
	}

	// Truncated sequence. Compare the remaining length instead of forming
	// `current + n`: a sequence whose last tail byte is the last byte of the
	// buffer is complete, and the pointer sum could point beyond end.
	if (decoder->end - decoder->current < tail_bytes)
	{
		decoder->current = decoder->end;
		return 0xFFFD;
	}

	for (; tail_bytes > 0; --tail_bytes)
	{
		c = *decoder->current & 0xFF;
		// Not a tail byte: report the broken sequence and leave this byte
		// in place so it is decoded by the next call.
		if ((c & LEADING_2_BITS) != UTF8_TAIL_BITS)
			return 0xFFFD;
		++decoder->current;
		ret = (ret << 6) + (c & 0x3F);
	}

	// Only 4-byte sequences can exceed the Unicode range; shorter ones are
	// at most 0xFFFF, so the check is a no-op for them.
	return (ret <= 0x10FFFF) ? ret : 0xFFFD;
}

typedef struct
{
uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
@@ -538,6 +593,7 @@ static const encoding_t string_decoders[] = {
{ string_decoder_getc_codepage, ISO_8859_16 },
{ string_decoder_getc_codepage, KOI8_R },
{ string_decoder_getc_codepage, ISCII },
{ string_decoder_getc_utf_8, NULL },
};

espeak_ng_TEXT_DECODER *
@@ -565,7 +621,7 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder,
int length,
espeak_ng_ENCODING encoding)
{
if (encoding > ESPEAKNG_ENCODING_ISCII)
if (encoding > ESPEAKNG_ENCODING_UTF_8)
return ENS_UNKNOWN_TEXT_ENCODING;

const encoding_t *enc = string_decoders + encoding;

+ 105
- 0
tests/encoding.c View File

@@ -599,6 +599,109 @@ test_iso_8859_16_encoding()
destroy_text_decoder(decoder);
}

// Decode the first `length` bytes of `bytes` as UTF-8 and check that the
// decoder yields exactly the `count` codepoints listed in `expected`,
// reporting end of input only once the last of them has been read.
static void
check_utf_8_decoding(espeak_ng_TEXT_DECODER *decoder,
                     const char *bytes,
                     int length,
                     const unsigned long *expected,
                     size_t count)
{
	assert(text_decoder_decode_string(decoder, bytes, length, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	for (size_t i = 0; i < count; ++i)
	{
		assert(text_decoder_eof(decoder) == 0);
		assert(text_decoder_getc(decoder) == expected[i]);
	}
	assert(text_decoder_eof(decoder) == 1);
}

// Checks the UTF-8 encoding names and the decoding of 1- to 4-byte
// sequences, including malformed, truncated and out-of-range input.
void
test_utf_8_encoding()
{
	printf("testing UTF-8 encoding\n");

	assert(espeak_ng_EncodingFromName("UTF-8") == ESPEAKNG_ENCODING_UTF_8);
	assert(espeak_ng_EncodingFromName("csUTF8") == ESPEAKNG_ENCODING_UTF_8);

	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	// 1-byte UTF-8 sequences
	static const unsigned long one_byte[] = {
		0x000D, 0x001E, 0x0020, 0x0035, 0x0042, 0x0057, 0x0065, 0x0077,
	};
	check_utf_8_decoding(decoder, "\x0D\x1E\x20\x35\x42\x57\x65\x77", 8,
	                     one_byte, sizeof one_byte / sizeof one_byte[0]);

	// UTF-8 tail bytes without an initial length indicator character
	static const unsigned long lone_tails[] = {
		0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
	};
	check_utf_8_decoding(decoder, "\x84\x92\xA8\xB5", 4,
	                     lone_tails, sizeof lone_tails / sizeof lone_tails[0]);

	// 2-byte UTF-8 sequences
	static const unsigned long two_byte[] = {
		0x00A0,
		0x0430,
		0xFFFD, // \x65 is not a continuation byte, so \xC5 is invalid
		0x0065,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	check_utf_8_decoding(decoder, "\xC2\xA0\xD0\xB0\xC5\x65\xC2\xA0", 7,
	                     two_byte, sizeof two_byte / sizeof two_byte[0]);

	// 3-byte UTF-8 sequences
	static const unsigned long three_byte[] = {
		0x4E8C,
		0xFFFD, // \x42 is not a continuation byte, so \xE8 is invalid
		0x0042,
		0xFFFD, // \x44 is not a continuation byte, so \xE2\x93 is invalid
		0x0044,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	check_utf_8_decoding(decoder, "\xE4\xBA\x8C\xE8\x42\xE2\x93\x44\xE4\xA0\x80", 9,
	                     three_byte, sizeof three_byte / sizeof three_byte[0]);

	// 4-byte UTF-8 sequences
	static const unsigned long four_byte[] = {
		0x10302,
		0x10FFFF,
		0xFFFD, // \x61 is not a continuation byte, so \xF3 is invalid
		0x0061,
		0xFFFD, // \x32 is not a continuation byte, so \xF3\xA5 is invalid
		0x0032,
		0xFFFD, // \x36 is not a continuation byte, so \xF3\x87\xB2 is invalid
		0x0036,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	check_utf_8_decoding(decoder, "\xF0\x90\x8C\x82\xF4\x8F\xBF\xBF\xF3\x61\xF3\xA5\x32\xF3\x87\xB2\x36\xF1\xA0\x80\x80", 18,
	                     four_byte, sizeof four_byte / sizeof four_byte[0]);

	// out of range (> 0x10FFFF)
	static const unsigned long out_of_range[] = {
		0xFFFD,
	};
	check_utf_8_decoding(decoder, "\xF4\x90\x80\x80", 4,
	                     out_of_range, sizeof out_of_range / sizeof out_of_range[0]);

	destroy_text_decoder(decoder);
}

int
main(int argc, char **argv)
{
@@ -626,6 +729,8 @@ main(int argc, char **argv)
test_iso_8859_15_encoding();
test_iso_8859_16_encoding();

test_utf_8_encoding();

printf("done\n");

return EXIT_SUCCESS;

Loading…
Cancel
Save