Browse Source

encoding.c: Implement support for the auto-detected character set (UTF-8 with a codepage-encoding fallback).

master
Reece H. Dunn 8 years ago
parent
commit
d167d5649b
4 changed files with 84 additions and 3 deletions
  1. 33
    0
      src/libespeak-ng/encoding.c
  2. 6
    0
      src/libespeak-ng/encoding.h
  3. 10
    3
      src/libespeak-ng/translate.c
  4. 35
    0
      tests/encoding.c

+ 33
- 0
src/libespeak-ng/encoding.c View File

@@ -602,6 +602,19 @@ string_decoder_getc_wchar(espeak_ng_TEXT_DECODER *decoder)
return c;
}

/*
 * Read the next character for the auto-detected character set.
 *
 * The text is first read with the UTF-8 decoder. A result of U+FFFD is
 * treated as "this is not UTF-8": decoder->get is replaced with the codepage
 * decoder, the read position is rewound to where this call started, and the
 * character is read again through the new getter.
 *
 * The one exception is a U+FFFD that is really present in the input as the
 * byte sequence EF BF BD. That is well-formed UTF-8, so it is returned as-is
 * and does not trigger the fallback.
 *
 * Returns the decoded character.
 */
static uint32_t
string_decoder_getc_auto(espeak_ng_TEXT_DECODER *decoder)
{
	const uint8_t *start = decoder->current;
	uint32_t c = string_decoder_getc_utf_8(decoder);
	if (c != 0xFFFD)
		return c;

	// A literal, correctly encoded U+FFFD is valid UTF-8, not a decode failure.
	if (decoder->end - start >= 3 &&
	    start[0] == 0xEF && start[1] == 0xBF && start[2] == 0xBD)
		return c;

	// Not UTF-8: switch getters and re-read the same bytes via the codepage.
	decoder->get = string_decoder_getc_codepage;
	decoder->current = start;
	return decoder->get(decoder);
}

typedef struct
{
uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
@@ -672,6 +685,26 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder,
return ENS_OK;
}

/*
 * Set up `decoder` to read the `length` bytes starting at `string` using the
 * auto-detecting getter (string_decoder_getc_auto), with the codepage taken
 * from the string_decoders entry for `encoding`.
 *
 * Returns ENS_UNKNOWN_TEXT_ENCODING, leaving `decoder` untouched, when
 * `encoding` is greater than ESPEAKNG_ENCODING_ISO_10646_UCS_2 or its table
 * entry has no getter; returns ENS_OK otherwise.
 */
espeak_ng_STATUS
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
                                const char *string,
                                int length,
                                espeak_ng_ENCODING encoding)
{
	if (encoding <= ESPEAKNG_ENCODING_ISO_10646_UCS_2) {
		const encoding_t *entry = &string_decoders[encoding];
		if (entry->get != NULL) {
			const uint8_t *first = (const uint8_t *)string;

			decoder->current = first;
			decoder->end = first + length;
			decoder->codepage = entry->codepage;
			decoder->get = string_decoder_getc_auto;
			return ENS_OK;
		}
	}

	return ENS_UNKNOWN_TEXT_ENCODING;
}

espeak_ng_STATUS
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
const wchar_t *string,

+ 6
- 0
src/libespeak-ng/encoding.h View File

@@ -36,6 +36,12 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder,
int length,
espeak_ng_ENCODING encoding);

/*
 * Prepare `decoder` to read `length` bytes starting at `string` with the
 * auto-detecting getter: text is read as UTF-8, and the codepage associated
 * with `encoding` is used as the fallback.
 *
 * Returns ENS_UNKNOWN_TEXT_ENCODING if `encoding` is greater than
 * ESPEAKNG_ENCODING_ISO_10646_UCS_2 or has no decoder registered for it,
 * and ENS_OK otherwise.
 */
espeak_ng_STATUS
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
const char *string,
int length,
espeak_ng_ENCODING encoding);

espeak_ng_STATUS
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
const wchar_t *string,

+ 10
- 3
src/libespeak-ng/translate.c View File

@@ -1916,16 +1916,21 @@ int UpperCaseInWord(Translator *tr, char *word, int c)
return 0;
}

static espeak_ng_STATUS init_wstring_decoder(const wchar_t *text)
static inline espeak_ng_STATUS init_wstring_decoder(const wchar_t *text)
{
// Point p_decoder at the wide string `text`. The length passed is
// wcslen(text) + 1, so the terminating L'\0' is inside the decoded range.
return text_decoder_decode_wstring(p_decoder, text, wcslen(text) + 1);
}

static espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding)
static inline espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding)
{
// Point p_decoder at the byte string `text`, to be read as `encoding`. The
// length passed is strlen(text) + 1, so the terminating NUL is inside the
// decoded range.
return text_decoder_decode_string(p_decoder, text, strlen(text) + 1, encoding);
}

// Point p_decoder at the byte string `text` using the auto-detecting decoder
// setup, passing `encoding` through to text_decoder_decode_string_auto.
static inline espeak_ng_STATUS init_string_decoder_auto(const char *text, espeak_ng_ENCODING encoding)
{
	// Count the terminating NUL so it is inside the decoded range.
	const size_t byte_count = strlen(text) + 1;

	return text_decoder_decode_string_auto(p_decoder, text, byte_count, encoding);
}

const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, char **voice_change)
{
int ix;
@@ -1985,7 +1990,9 @@ const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out,
case espeakCHARS_WCHAR:
init_wstring_decoder((const wchar_t *)vp_input);
break;
case espeakCHARS_AUTO: // TODO: Implement UTF-8 => 8BIT fallback on 0xFFFD UTF-8 characters.
case espeakCHARS_AUTO:
init_string_decoder_auto((const char *)vp_input, tr->encoding);
break;
case espeakCHARS_UTF8:
init_string_decoder((const char *)vp_input, ESPEAKNG_ENCODING_UTF_8);
break;

+ 35
- 0
tests/encoding.c View File

@@ -749,6 +749,40 @@ test_wchar_decoder()
destroy_text_decoder(decoder);
}

// Exercise text_decoder_decode_string_auto with one input that is valid UTF-8
// and one that is not, both set up with ISO-8859-1 as the fallback encoding.
// Each step checks text_decoder_eof before reading and the decoded value after.
void
test_auto_decoder()
{
printf("testing auto decoder (UTF-8 + codepage-based fallback)\n");

espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

// UTF-8
// 5 bytes: 'a', 'G', C2 A0 (the UTF-8 encoding of U+00A0), ' '.
// The two-byte sequence must come back as the single character 0xA0.
assert(text_decoder_decode_string_auto(decoder, "aG\xC2\xA0 ", 5, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK);
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 'a');
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 'G');
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 0xA0);
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == ' ');
assert(text_decoder_eof(decoder) == 1);

// ISO-8859-1
// 4 bytes: 'a', 'G', octal \240 (0xA0), 'f'. A lone 0xA0 is a UTF-8
// continuation byte, so this input is not valid UTF-8; the byte is expected
// to come back as 0xA0 and the following 'f' must still be read correctly.
assert(text_decoder_decode_string_auto(decoder, "aG\240f", 4, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK);
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 'a');
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 'G');
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 0xA0);
assert(text_decoder_eof(decoder) == 0);
assert(text_decoder_getc(decoder) == 'f');
assert(text_decoder_eof(decoder) == 1);

destroy_text_decoder(decoder);
}

int
main(int argc, char **argv)
{
@@ -780,6 +814,7 @@ main(int argc, char **argv)
test_iso_10646_ucs_2_encoding();

test_wchar_decoder();
test_auto_decoder();

printf("done\n");


Loading…
Cancel
Save