Browse Source

encoding.c: Support the UTF-8 encoding.

master
Reece H. Dunn 8 years ago
parent
commit
fa5d31a8af
3 changed files with 163 additions and 1 deletions
  1. 1
    0
      src/include/espeak-ng/espeak_ng.h
  2. 57
    1
      src/libespeak-ng/encoding.c
  3. 105
    0
      tests/encoding.c

+ 1
- 0
src/include/espeak-ng/espeak_ng.h View File

ESPEAKNG_ENCODING_ISO_8859_16, ESPEAKNG_ENCODING_ISO_8859_16,
ESPEAKNG_ENCODING_KOI8_R, ESPEAKNG_ENCODING_KOI8_R,
ESPEAKNG_ENCODING_ISCII, ESPEAKNG_ENCODING_ISCII,
ESPEAKNG_ENCODING_UTF_8,
} espeak_ng_ENCODING; } espeak_ng_ENCODING;


ESPEAK_NG_API espeak_ng_ENCODING ESPEAK_NG_API espeak_ng_ENCODING

+ 57
- 1
src/libespeak-ng/encoding.c View File

#include "speech.h" #include "speech.h"
#include "encoding.h" #include "encoding.h"


#define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000

// http://www.iana.org/assignments/character-sets/character-sets.xhtml // http://www.iana.org/assignments/character-sets/character-sets.xhtml
MNEM_TAB mnem_encoding[] = { MNEM_TAB mnem_encoding[] = {
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, { "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII },
{ "Latin-9", ESPEAKNG_ENCODING_ISO_8859_15 }, { "Latin-9", ESPEAKNG_ENCODING_ISO_8859_15 },
{ "TIS-620", ESPEAKNG_ENCODING_ISO_8859_11 }, { "TIS-620", ESPEAKNG_ENCODING_ISO_8859_11 },
{ "US-ASCII", ESPEAKNG_ENCODING_US_ASCII }, { "US-ASCII", ESPEAKNG_ENCODING_US_ASCII },
{ "UTF-8", ESPEAKNG_ENCODING_UTF_8 },
{ "cp367", ESPEAKNG_ENCODING_US_ASCII }, { "cp367", ESPEAKNG_ENCODING_US_ASCII },
{ "cp819", ESPEAKNG_ENCODING_ISO_8859_1 }, { "cp819", ESPEAKNG_ENCODING_ISO_8859_1 },
{ "csASCII", ESPEAKNG_ENCODING_US_ASCII }, { "csASCII", ESPEAKNG_ENCODING_US_ASCII },
{ "csISOLatinHebrew", ESPEAKNG_ENCODING_ISO_8859_8 }, { "csISOLatinHebrew", ESPEAKNG_ENCODING_ISO_8859_8 },
{ "csKOI8R", ESPEAKNG_ENCODING_KOI8_R }, { "csKOI8R", ESPEAKNG_ENCODING_KOI8_R },
{ "csTIS620", ESPEAKNG_ENCODING_ISO_8859_11 }, { "csTIS620", ESPEAKNG_ENCODING_ISO_8859_11 },
{ "csUTF8", ESPEAKNG_ENCODING_UTF_8 },
{ "arabic", ESPEAKNG_ENCODING_ISO_8859_6 }, { "arabic", ESPEAKNG_ENCODING_ISO_8859_6 },
{ "cyrillic", ESPEAKNG_ENCODING_ISO_8859_5 }, { "cyrillic", ESPEAKNG_ENCODING_ISO_8859_5 },
{ "greek", ESPEAKNG_ENCODING_ISO_8859_7 }, { "greek", ESPEAKNG_ENCODING_ISO_8859_7 },
return (c >= 0x80) ? decoder->codepage[c - 0x80] : c; return (c >= 0x80) ? decoder->codepage[c - 0x80] : c;
} }


// Decode the next UTF-8 sequence from the decoder's input and return its
// Unicode code point, advancing decoder->current past the bytes consumed.
//
// U+FFFD (REPLACEMENT CHARACTER) is returned when:
//   - the first byte is a tail byte (10xxxxxx) with no preceding lead byte;
//   - a non-tail byte appears where a tail byte is required; that byte is
//     left unread so the next call decodes it as the start of a new sequence;
//   - the input ends before the sequence is complete; the remaining bytes
//     are consumed so decoder->current == decoder->end afterwards;
//   - the decoded value is greater than U+10FFFF.
//
// The lead byte is read without a bounds check, so the caller must make sure
// at least one byte is available before calling.
//
// This assumes decoder->end points one past the last input byte, so a
// sequence whose final tail byte is the last byte of the input is complete.
//
// NOTE(review): overlong encodings (e.g. C0 80) and UTF-16 surrogates
// (ED A0 80 .. ED BF BF) are decoded rather than rejected -- confirm whether
// callers need strict RFC 3629 validation.
static uint32_t
string_decoder_getc_utf_8(espeak_ng_TEXT_DECODER *decoder)
{
	uint8_t c = *decoder->current++ & 0xFF;
	uint32_t ret;
	int tail_bytes;

	switch (c & 0xF0)
	{
	// 1-byte UTF-8 sequence
	case 0x00: case 0x10: case 0x20: case 0x30:
	case 0x40: case 0x50: case 0x60: case 0x70:
		return c;
	// UTF-8 tail byte -- invalid in isolation
	case 0x80: case 0x90: case 0xA0: case 0xB0:
		return 0xFFFD;
	// 2-byte UTF-8 sequence
	case 0xC0: case 0xD0:
		ret = c & 0x1F;
		tail_bytes = 1;
		break;
	// 3-byte UTF-8 sequence
	case 0xE0:
		ret = c & 0x0F;
		tail_bytes = 2;
		break;
	// 4-byte UTF-8 sequence; the masked switch value has no other
	// possibilities, default only keeps the locals provably initialised.
	case 0xF0:
	default:
		ret = c & 0x0F;
		tail_bytes = 3;
		break;
	}

	// The lead byte has already been consumed, so current points at the first
	// tail byte. Compare the remaining length instead of forming
	// `current + n`, which could point more than one past the buffer end.
	if (decoder->end - decoder->current < tail_bytes)
	{
		decoder->current = decoder->end;
		return 0xFFFD;
	}

	while (tail_bytes-- > 0)
	{
		c = *decoder->current & 0xFF;
		if ((c & LEADING_2_BITS) != UTF8_TAIL_BITS)
			return 0xFFFD; // offending byte stays unread for the next call
		++decoder->current;
		ret = (ret << 6) + (c & 0x3F);
	}

	// Only a 4-byte sequence can exceed the Unicode range.
	return (ret <= 0x10FFFF) ? ret : 0xFFFD;
}

typedef struct typedef struct
{ {
uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder); uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
{ string_decoder_getc_codepage, ISO_8859_16 }, { string_decoder_getc_codepage, ISO_8859_16 },
{ string_decoder_getc_codepage, KOI8_R }, { string_decoder_getc_codepage, KOI8_R },
{ string_decoder_getc_codepage, ISCII }, { string_decoder_getc_codepage, ISCII },
{ string_decoder_getc_utf_8, NULL },
}; };


espeak_ng_TEXT_DECODER * espeak_ng_TEXT_DECODER *
int length, int length,
espeak_ng_ENCODING encoding) espeak_ng_ENCODING encoding)
{ {
if (encoding > ESPEAKNG_ENCODING_ISCII)
if (encoding > ESPEAKNG_ENCODING_UTF_8)
return ENS_UNKNOWN_TEXT_ENCODING; return ENS_UNKNOWN_TEXT_ENCODING;


const encoding_t *enc = string_decoders + encoding; const encoding_t *enc = string_decoders + encoding;

+ 105
- 0
tests/encoding.c View File

destroy_text_decoder(decoder); destroy_text_decoder(decoder);
} }


// Load `length` bytes of `input` into the decoder as UTF-8 and check that it
// yields exactly the `count` code points in `expected`: the decoder must not
// report end-of-input before each read, and must report it after the last.
static void
expect_utf_8_codepoints(espeak_ng_TEXT_DECODER *decoder,
                        const char *input,
                        int length,
                        const unsigned long *expected,
                        int count)
{
	int i;

	assert(text_decoder_decode_string(decoder, input, length, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	for (i = 0; i < count; ++i)
	{
		assert(text_decoder_eof(decoder) == 0);
		assert(text_decoder_getc(decoder) == expected[i]);
	}
	assert(text_decoder_eof(decoder) == 1);
}

// Exercise the UTF-8 decoder: name lookup, then well-formed, malformed,
// truncated and out-of-range sequences of every length.
void
test_utf_8_encoding()
{
	// 1-byte UTF-8 sequences
	static const unsigned long one_byte[] = {
		0x000D, 0x001E, 0x0020, 0x0035, 0x0042, 0x0057, 0x0065, 0x0077,
	};
	// UTF-8 tail bytes without an initial length indicator character
	static const unsigned long lone_tails[] = {
		0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
	};
	// 2-byte UTF-8 sequences
	static const unsigned long two_byte[] = {
		0x00A0,
		0x0430,
		0xFFFD, // \x65 is not a continuation byte, so \xC5 is invalid
		0x0065,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	// 3-byte UTF-8 sequences
	static const unsigned long three_byte[] = {
		0x4E8C,
		0xFFFD, // \x42 is not a continuation byte, so \xE8 is invalid
		0x0042,
		0xFFFD, // \x44 is not a continuation byte, so \xE2\x93 is invalid
		0x0044,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	// 4-byte UTF-8 sequences
	static const unsigned long four_byte[] = {
		0x10302,
		0x10FFFF,
		0xFFFD, // \x61 is not a continuation byte, so \xF3 is invalid
		0x0061,
		0xFFFD, // \x32 is not a continuation byte, so \xF3\xA5 is invalid
		0x0032,
		0xFFFD, // \x36 is not a continuation byte, so \xF3\x87\xB2 is invalid
		0x0036,
		0xFFFD, // incomplete: \xA0 is past the end of the string
	};
	// out of range (> 0x10FFFF)
	static const unsigned long out_of_range[] = {
		0xFFFD,
	};

	printf("testing UTF-8 encoding\n");

	assert(espeak_ng_EncodingFromName("UTF-8") == ESPEAKNG_ENCODING_UTF_8);
	assert(espeak_ng_EncodingFromName("csUTF8") == ESPEAKNG_ENCODING_UTF_8);

	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	expect_utf_8_codepoints(decoder,
		"\x0D\x1E\x20\x35\x42\x57\x65\x77", 8,
		one_byte, (int)(sizeof(one_byte) / sizeof(one_byte[0])));

	expect_utf_8_codepoints(decoder,
		"\x84\x92\xA8\xB5", 4,
		lone_tails, (int)(sizeof(lone_tails) / sizeof(lone_tails[0])));

	// The length deliberately stops after the final lead byte.
	expect_utf_8_codepoints(decoder,
		"\xC2\xA0\xD0\xB0\xC5\x65\xC2\xA0", 7,
		two_byte, (int)(sizeof(two_byte) / sizeof(two_byte[0])));

	// The length deliberately stops after the final lead byte.
	expect_utf_8_codepoints(decoder,
		"\xE4\xBA\x8C\xE8\x42\xE2\x93\x44\xE4\xA0\x80", 9,
		three_byte, (int)(sizeof(three_byte) / sizeof(three_byte[0])));

	// The length deliberately stops after the final lead byte.
	expect_utf_8_codepoints(decoder,
		"\xF0\x90\x8C\x82\xF4\x8F\xBF\xBF\xF3\x61\xF3\xA5\x32\xF3\x87\xB2\x36\xF1\xA0\x80\x80", 18,
		four_byte, (int)(sizeof(four_byte) / sizeof(four_byte[0])));

	expect_utf_8_codepoints(decoder,
		"\xF4\x90\x80\x80", 4,
		out_of_range, (int)(sizeof(out_of_range) / sizeof(out_of_range[0])));

	destroy_text_decoder(decoder);
}

int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
test_iso_8859_15_encoding(); test_iso_8859_15_encoding();
test_iso_8859_16_encoding(); test_iso_8859_16_encoding();


test_utf_8_encoding();

printf("done\n"); printf("done\n");


return EXIT_SUCCESS; return EXIT_SUCCESS;

Loading…
Cancel
Save