Browse Source

encoding.c: Implement support for the auto-detected character set (UTF-8 with a codepage-based fallback).

master
Reece H. Dunn 8 years ago
parent
commit
d167d5649b
4 changed files with 84 additions and 3 deletions
  1. 33
    0
      src/libespeak-ng/encoding.c
  2. 6
    0
      src/libespeak-ng/encoding.h
  3. 10
    3
      src/libespeak-ng/translate.c
  4. 35
    0
      tests/encoding.c

+ 33
- 0
src/libespeak-ng/encoding.c View File

return c; return c;
} }


// Read the next character, trying UTF-8 first.
//
// When the UTF-8 reader yields 0xFFFD, the read position is rewound to where
// this call started, the decoder's |get| callback is replaced with the
// codepage reader (so every later read goes straight to it), and the same
// bytes are decoded again through that reader.
//
// NOTE(review): 0xFFFD is presumably what string_decoder_getc_utf_8 reports
// for an ill-formed sequence; a literal U+FFFD in well-formed UTF-8 input
// would take the same path -- confirm this is intended.
static uint32_t
string_decoder_getc_auto(espeak_ng_TEXT_DECODER *decoder)
{
	const uint8_t *start = decoder->current;
	uint32_t ch = string_decoder_getc_utf_8(decoder);
	if (ch != 0xFFFD)
		return ch;

	// Rewind and hand the decoder over to the codepage reader.
	decoder->current = start;
	decoder->get = string_decoder_getc_codepage;
	return string_decoder_getc_codepage(decoder);
}

typedef struct typedef struct
{ {
uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder); uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
return ENS_OK; return ENS_OK;
} }


// Point |decoder| at |length| bytes starting at |string| and install the
// auto-detecting reader (string_decoder_getc_auto) as its |get| callback.
//
// |encoding| selects the entry of string_decoders whose codepage is stored on
// the decoder. Returns ENS_UNKNOWN_TEXT_ENCODING when |encoding| is greater
// than ESPEAKNG_ENCODING_ISO_10646_UCS_2 or its table entry has no |get|
// callback; otherwise returns ENS_OK.
//
// NOTE(review): an entry may have a |get| callback but no codepage table; what
// the codepage reader does in that case is not visible here -- confirm.
espeak_ng_STATUS
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
                                const char *string,
                                int length,
                                espeak_ng_ENCODING encoding)
{
	if (encoding > ESPEAKNG_ENCODING_ISO_10646_UCS_2)
		return ENS_UNKNOWN_TEXT_ENCODING;

	const encoding_t *entry = &string_decoders[encoding];
	if (!entry->get)
		return ENS_UNKNOWN_TEXT_ENCODING;

	const uint8_t *first = (const uint8_t *)string;

	decoder->current = first;
	decoder->end = first + length;
	decoder->codepage = entry->codepage;
	decoder->get = string_decoder_getc_auto;
	return ENS_OK;
}

espeak_ng_STATUS espeak_ng_STATUS
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
const wchar_t *string, const wchar_t *string,

+ 6
- 0
src/libespeak-ng/encoding.h View File

int length, int length,
espeak_ng_ENCODING encoding); espeak_ng_ENCODING encoding);


// Prepare |decoder| to read |length| bytes from |string| with automatic
// character set detection: characters are read as UTF-8 until the UTF-8
// reader yields 0xFFFD, after which the decoder switches to the codepage
// associated with |encoding| for that character and all that follow.
//
// Returns ENS_OK on success, or ENS_UNKNOWN_TEXT_ENCODING when |encoding| is
// greater than ESPEAKNG_ENCODING_ISO_10646_UCS_2 or has no decoder registered.
espeak_ng_STATUS
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
const char *string,
int length,
espeak_ng_ENCODING encoding);

espeak_ng_STATUS espeak_ng_STATUS
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
const wchar_t *string, const wchar_t *string,

+ 10
- 3
src/libespeak-ng/translate.c View File

return 0; return 0;
} }


static espeak_ng_STATUS init_wstring_decoder(const wchar_t *text)
static inline espeak_ng_STATUS init_wstring_decoder(const wchar_t *text)
{ {
return text_decoder_decode_wstring(p_decoder, text, wcslen(text) + 1); return text_decoder_decode_wstring(p_decoder, text, wcslen(text) + 1);
} }


static espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding)
static inline espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding)
{ {
return text_decoder_decode_string(p_decoder, text, strlen(text) + 1, encoding); return text_decoder_decode_string(p_decoder, text, strlen(text) + 1, encoding);
} }


// Set up the shared p_decoder to read |text| with UTF-8 auto-detection,
// using |encoding| as the fallback. Returns the status reported by
// text_decoder_decode_string_auto.
static inline espeak_ng_STATUS init_string_decoder_auto(const char *text, espeak_ng_ENCODING encoding)
{
	// The terminating NUL is part of the decoded range, hence the + 1.
	int length = strlen(text) + 1;
	return text_decoder_decode_string_auto(p_decoder, text, length, encoding);
}

const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, char **voice_change) const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, char **voice_change)
{ {
int ix; int ix;
case espeakCHARS_WCHAR: case espeakCHARS_WCHAR:
init_wstring_decoder((const wchar_t *)vp_input); init_wstring_decoder((const wchar_t *)vp_input);
break; break;
case espeakCHARS_AUTO: // TODO: Implement UTF-8 => 8BIT fallback on 0xFFFD UTF-8 characters.
case espeakCHARS_AUTO:
init_string_decoder_auto((const char *)vp_input, tr->encoding);
break;
case espeakCHARS_UTF8: case espeakCHARS_UTF8:
init_string_decoder((const char *)vp_input, ESPEAKNG_ENCODING_UTF_8); init_string_decoder((const char *)vp_input, ESPEAKNG_ENCODING_UTF_8);
break; break;

+ 35
- 0
tests/encoding.c View File

destroy_text_decoder(decoder); destroy_text_decoder(decoder);
} }


// Exercise the auto-detecting decoder with ISO-8859-1 as the fallback
// encoding: once with well-formed UTF-8 input and once with a raw 0xA0 byte
// that is not valid UTF-8 and must be picked up by the codepage fallback.
void
test_auto_decoder()
{
	printf("testing auto decoder (UTF-8 + codepage-based fallback)\n");

	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	// UTF-8: the two bytes C2 A0 decode to the single code point 0xA0.
	static const int utf8_expected[] = { 'a', 'G', 0xA0, ' ' };
	assert(text_decoder_decode_string_auto(decoder, "aG\xC2\xA0 ", 5, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK);
	for (size_t i = 0; i < sizeof(utf8_expected) / sizeof(utf8_expected[0]); ++i) {
		assert(text_decoder_eof(decoder) == 0);
		assert(text_decoder_getc(decoder) == utf8_expected[i]);
	}
	assert(text_decoder_eof(decoder) == 1);

	// ISO-8859-1: the lone byte 0xA0 (octal 240) is read via the fallback.
	static const int latin1_expected[] = { 'a', 'G', 0xA0, 'f' };
	assert(text_decoder_decode_string_auto(decoder, "aG\240f", 4, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK);
	for (size_t i = 0; i < sizeof(latin1_expected) / sizeof(latin1_expected[0]); ++i) {
		assert(text_decoder_eof(decoder) == 0);
		assert(text_decoder_getc(decoder) == latin1_expected[i]);
	}
	assert(text_decoder_eof(decoder) == 1);

	destroy_text_decoder(decoder);
}

int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
test_iso_10646_ucs_2_encoding(); test_iso_10646_ucs_2_encoding();


test_wchar_decoder(); test_wchar_decoder();
test_auto_decoder();


printf("done\n"); printf("done\n");



Loading…
Cancel
Save